import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Load the data and get some initial insight into its structure.
pdata = pd.read_excel("data.xlsx")
pdata.head()
# Drop the first two columns (positions 0 and 1) — identifiers, not features.
pdata.drop(pdata.columns[[0, 1]], inplace=True, axis=1)
# How many distinct values each column takes (helps spot categorical columns).
unique_values = pdata.nunique()
print('Count unique values in each column')
print(unique_values)
# BUG FIX: `pdata.info` only referenced the bound method without calling it,
# so the summary was never printed; add the parentheses.
pdata.info()
# Per-column summary statistics, transposed so each feature is a row.
pdata.describe().transpose()
print("Missing values:\n{}".format(pdata.isnull().sum()))
# Generate a pairplot to see multivariate relationships between features;
# KDE on the diagonal shows each feature's distribution.
sns.pairplot(pdata, diag_kind='kde')
# Based on the kde plots above, we can work with 2 or 3 clusters
from scipy.stats import zscore
# Standardize every column (zero mean, unit variance) so features on larger
# scales do not dominate the distance-based clustering below.
pdata_z = pdata.apply(zscore)
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score

# Elbow method: for each candidate k, fit K-means and record the average
# distortion (mean distance from each sample to its nearest cluster center).
# FIX 1: the loop body below had lost its indentation (a syntax error in a
# plain .py file) — restored.
# FIX 2: distortions are now computed on the standardized data (pdata_z),
# the same data the final model is fitted on later; the original used the
# raw pdata here, making the elbow inconsistent with the actual clustering.
clusters = range(1, 10)
meanDistortions = []
for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(pdata_z)
    meanDistortions.append(
        sum(np.min(cdist(pdata_z, model.cluster_centers_, 'euclidean'), axis=1))
        / pdata_z.shape[0]
    )

plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
# k=3 chosen from the elbow plot above; fixed random_state for reproducibility.
kmeans = KMeans(n_clusters=3, n_init=15, random_state=2345)
# Compute K-means clustering on the standardized features.
kmeans.fit(pdata_z)

# Inspect the cluster centers (coordinates are in z-score units).
centroids = kmeans.cluster_centers_
centroids
centroid_df = pd.DataFrame(centroids, columns=pdata_z.columns)
print(centroid_df)

# Average silhouette coefficient: closer to 1 means better-separated clusters.
silhouette_avg = silhouette_score(pdata_z, kmeans.labels_)
print("silhouette_avg for K-means is:{}".format(silhouette_avg))
# Cluster 0 has the lowest values and cluster 2 has the highest values
# Attach the K-means labels to the (unscaled) data as a categorical column
# so each segment can be profiled in original units.
df_labels = pd.DataFrame(kmeans.labels_, columns=['labels'])
df_labels['labels'] = df_labels['labels'].astype('category')
df_labeled = pdata.join(df_labels)
# FIX: the original used groupby(...).head(4177) with a hard-coded row count
# (and the deprecated `axis=0` groupby argument); take up to the actual number
# of rows per group so this works for any dataset size.
df_analysis = df_labeled.groupby('labels').head(len(df_labeled))
df_analysis
# Segment sizes.
df_labeled['labels'].value_counts()
cluster_count = sns.countplot(x="labels", data=df_analysis)
# Profile each K-means cluster with box plots of the original (unscaled)
# features: bank visits, online visits, calls made, card count, credit limit.
for feature in (
    'Total_visits_bank',
    'Total_visits_online',
    'Total_calls_made',
    'Total_Credit_Cards',
    'Avg_Credit_Limit',
):
    sns.boxplot(x='labels', y=feature, data=df_analysis)
from sklearn.cluster import AgglomerativeClustering

# Pairplot of the standardized features before hierarchical clustering.
sns.pairplot(pdata_z, height=2, aspect=2, diag_kind='kde')

# FIX: the `affinity` keyword was renamed to `metric` in scikit-learn 1.2 and
# removed in 1.4. 'euclidean' is the default distance anyway, so omitting the
# argument keeps identical behavior and works on both old and new versions.
model = AgglomerativeClustering(n_clusters=3, linkage='average')
model.fit(pdata_z)

# Calculate silhouette_avg for hierarchical clustering (closer to 1 is better).
silhouette_avg = silhouette_score(pdata_z, model.labels_)
print("silhouette_avg for Hierachy clustering is:{}".format(silhouette_avg))

# NOTE(review): this mutates pdata_z in place — any later distance computation
# on pdata_z will include this label column unless it is explicitly dropped.
pdata_z['labels'] = model.labels_
pdata_z.head(10)
cluster_count = sns.countplot(x="labels", data=pdata_z)
# Box plots of each standardized feature against the hierarchical cluster
# labels: card count, bank visits, online visits, calls made.
for feature in (
    'Total_Credit_Cards',
    'Total_visits_bank',
    'Total_visits_online',
    'Total_calls_made',
):
    sns.boxplot(x='labels', y=feature, data=pdata_z)
# Pairwise distribution between data points
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage

# FIX: pdata_z was given a 'labels' column earlier; feeding the cluster
# assignments into the distance computation biases the linkage, so exclude
# that column before computing pairwise distances.
features_avg = pdata_z.drop(columns='labels')

# Cophenetic correlation for the 'average' method — the closer it is to 1,
# the better the dendrogram preserves the original pairwise distances.
Z = linkage(features_avg, metric='euclidean', method='average')
c, coph_dists = cophenet(Z, pdist(features_avg))
c

plt.figure(figsize=(10, 5))
# FIX: corrected "Dendogram" -> "Dendrogram" in the displayed title.
plt.title('Agglomerative Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90., color_threshold=40, leaf_font_size=8.)
plt.tight_layout()
# Cophenetic correlation for the 'complete' method — closer to 1 means the
# dendrogram better preserves the original pairwise distances.
# FIX: exclude the 'labels' column that was added to pdata_z earlier so the
# cluster assignments do not leak into the pairwise distance computation.
features_comp = pdata_z.drop(columns='labels')
Z = linkage(features_comp, metric='euclidean', method='complete')
c, coph_dists = cophenet(Z, pdist(features_comp))
c

plt.figure(figsize=(10, 5))
# FIX: corrected "Dendogram" -> "Dendrogram" in the displayed title.
plt.title('Agglomerative Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90., color_threshold=90, leaf_font_size=10.)
plt.tight_layout()
How many different segments of customers are there? There are three of them.
How are these segments different from each other?
Group 0: Customers with a low average credit limit and few credit cards; they tend to make more phone calls than they visit online or go to the bank.
Group 1: Customers with an average credit limit (up to 100k); they make fewer phone calls and fewer online visits than group 0, and they visit the bank the most. Most of the bank's customers belong to this group.
Group 2: Customers with the highest credit limit; they make the fewest phone calls and visit the bank less than the other two groups, but they use online banking the most.
What are your recommendations to the bank on how to better market to and service these customers? Based on the analysis above, the bank can market online banking to groups 0 and 1, since they are potential customers for online banking (and possibly paperless) services. Because group 0 relies heavily on phone calls, the bank should also focus on maintaining high-quality phone customer service for them, and work on converting more group 1 customers to online channels.